### Replication code
### Article: "Turnover: How lame-duck governments disrupt the bureaucracy and service delivery before leaving office"
### Author: Guillermo Toral (www.guillermotoral.com)
### Date: July 2023
### This file scrapes the news site at the website of the Sao Paulo Prosecutor's Office (MPSP, Ministério Público do estado de São Paulo)
### R version, platform, and package versions reported at the end of the file

# Prepare the environment -------------------------------------------------

### This section of the code prepares the environment 

# Clean the environment
# NOTE(review): this removes every non-hidden object from the environment the
# script is run in, so unsaved objects in an interactive session are lost.
rm(list = ls())

# Install required packages if not previously installed
package_list <- c("tidyverse", "polite", "rvest", "codebook", "here")
packages_to_install <- package_list[!(package_list %in% installed.packages()[,"Package"])]
if(length(packages_to_install)>0){
  install.packages(packages_to_install)
}

# Load required packages
library(tidyverse)
library(polite)
library(rvest)
library(codebook)
library(here)

# Scrape links from one MPSP site search results page.
#
# Args:
#   url: passed as-is to xml2::read_html().
# Returns:
#   Character vector with the href of every <a> element whose href contains
#   the literal substring "mpsp.mp.br/w/", in document order.
scraplinks <- function(url){
  anchors <- rvest::html_nodes(xml2::read_html(url), "a")
  hrefs <- rvest::html_attr(anchors, "href")
  # fixed = TRUE: plain substring match, so the dots are not regex wildcards.
  # Anchors without an href yield NA, for which grepl() returns FALSE.
  keep <- grepl("mpsp.mp.br/w/", hrefs, fixed = TRUE)
  hrefs[keep]
}

# Set the working directory to the project root reported by here::here().
# The call is namespace-qualified so it works without the here package being
# attached (it still has to be installed).
# NOTE(review): here::here() returns the detected project root, not
# necessarily the folder containing this file; the relative paths used below
# ("../../datasets/...") are resolved against it -- confirm this is the
# intended base directory.
setwd(here::here())


# Scrape the news of the MPSP website --------------------------------------------------------------------

# URL of the page where the MPSP publishes news
url <- "https://mpsp.mp.br/busca?q=%22ex-prefeito%22"

npages <- 115 # number of pages in the search results list

# Fetch every results page once, storing the links of page i in slot i of a
# preallocated list (avoids re-copying a growing vector on each iteration).
# seq_len() is used so that npages = 0 performs no requests at all.
results_by_page <- vector("list", npages)
for(i in seq_len(npages)){
  page <- paste0(url, "&start=", i)
  results_by_page[[i]] <- scraplinks(page)
  Sys.sleep(runif(1, 1.5, 6)) # random 1.5-6 second pause between requests
}

# All scraped links, in page order (may contain duplicates)
links <- unlist(results_by_page)
# Number of matching links found on each results page
site_validlinks <- lengths(results_by_page)

# Download each linked news piece and extract its headline, date and text.
#
# Every distinct link is downloaded only once: the search results can list the
# same piece several times, and duplicated links are discarded from the final
# dataset anyway (first occurrence kept), so the resulting rows are unchanged.
article_links <- unique(links)
n_articles <- length(article_links)

# Preallocate one slot per article instead of growing the vectors in the loop
headline <- character(n_articles)
date <- character(n_articles)
text <- character(n_articles)
link <- character(n_articles)

# seq_along() makes the loop a no-op when no links were found
for(i in seq_along(article_links)){
  article <- read_html(article_links[i])
  nodes <- article %>% html_elements("p , .mpsp-display__socialmedia-date-span, h1")
  node_text <- nodes %>% html_text2()
  # Positions 2 and 3 of the matched nodes are taken as headline and date;
  # everything from position 4 onwards is treated as the body text.
  # NOTE(review): this relies on the page layout -- confirm against the site.
  headline[i] <- node_text[2]
  date[i] <- node_text[3]
  # With fewer than 4 matched nodes, 4:length(node_text) would count
  # backwards and paste unrelated elements; record a missing text instead.
  text[i] <- if(length(node_text) >= 4){
    paste(node_text[4:length(node_text)], collapse = " ")
  } else {
    NA_character_
  }
  link[i] <- article_links[i]
}

# Assemble the scraped vectors into a dataset with one row per news piece and
# add the publication year plus keyword indicators.
mpsp_news <- tibble(headline = headline, date = date, text = text, link = link) %>%
  mutate(# assumes characters 8-9 of `date` hold the two-digit year -- TODO confirm against the site's date format
         year = as.numeric(paste0("20", substr(date,8,9))),
         # Include dummies for whether the text includes key terms relevant for spending, employment, and corruption charges
         contains_pessoal = grepl("pessoal", text),
         contains_improbidade = grepl("improbidade", text),
         contains_concurs = grepl("concurs", text),
         contains_despesas = grepl("despesas", text),
         contains_servidor = grepl("servidor", text),
         contains_limite = grepl("limite", text),
         contains_contrat = grepl("contrat", text),
         # TRUE when any employment-related stem appears. The alternatives carry
         # no surrounding spaces: spaces inside a regex are literal, so
         # " concurs " could never match "concurso" and " servidor " missed
         # "servidores". NOTE: this makes the column differ from output
         # generated with the earlier space-padded pattern.
         contains_employment_terms = grepl("pessoal|servidor|cargos|concurs|contrat", text))

# Drop repeated news pieces: keep the first row for each distinct link,
# retaining all columns and the original row order
mpsp_news <- distinct(mpsp_news, link, .keep_all = TRUE)

# Export dataset ----------------------------------------------------------

# Save the MPSP news dataset as a CSV file (path is relative to the working directory).
mpsp_news %>%
  write_csv("../../datasets/downloaded/other/mpsp_news.csv")


# Generate a codebook for the dataset with manual codings of each news piece --------

## The mpsp_news_coded.csv dataset corresponds to mpsp_news as generated by the code above, together with variables hand-coded by a research assistant and by the author

# Load the hand-coded version of the news dataset
coded_news_path <- "../../datasets/analysis/mpsp_news_coded.csv"
n <- read_csv(coded_news_path)

# Description of each variable, keyed by column name
variable_labels <- list(
  headline = "Headline of the news piece",
  date = "Date the news piece was published",
  text = "Main text of the news piece",
  link = "Link to the news report",
  ra_relevant = "Indicator for whether the research assistant coded this news piece as relevant to the paper (i.e., concerning violations of public employment laws) (Y = yes, N = no, ? = in doubt, NA = missing)",
  ra_notes = "Notes added by the research assistant on the topic of the news report (NA = missing)",
  relevant = "Indicator for whether the author coded this news piece as relevant to the paper (i.e., concerning violations of public employment laws) (1 = yes, NA = no)",
  municipality = "Municipality the news piece refers to (NA = missing)",
  topic = "Topic of the news report (NA = missing)",
  conviction = "Indicator for whether the news piece reports a municipal politician was convicted (Y = yes, N= no, NA = missing)",
  fine = "Indicator for whether the news piece reports a municipal politician was imposed a fine (Y = yes, N= no, NA = missing)",
  political_rights = "Indicator for whether the news piece reports a municipal politician had their political rights suspended (Y = yes, N= no, NA = missing)",
  loss_office = "Indicator for whether the news piece reports a municipal politician lost their office (Y = yes, N= no, NA = missing)",
  prison = "Indicator for whether the news piece reports a municipal politician was imposed a prison sentence (Y = yes, N= no, NA = missing)",
  assets_blocked = "Indicator for whether the news piece reports a municipal politician had their assets blocked (Y = yes, N= no, NA = missing)"
)

# Attach the labels to the matching columns of the dataset
var_label(n) <- variable_labels

# Show the labelled variables with codebook's static label browser
codebook::label_browser_static(n)

# Notes: R version, platform, and loaded packages -------------------------

# Print the R version, platform, and attached/loaded packages of this session
sessionInfo()

# R version 4.2.1 (2022-06-23)
# Platform: aarch64-apple-darwin20 (64-bit)
# Running under: macOS Monterey 12.1
# 
# Matrix products: default
# LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib
# 
# locale:
#   [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
# 
# attached base packages:
#   [1] stats     graphics  grDevices utils     datasets  methods   base     
# 
# other attached packages:
#   [1] rvest_1.0.3     polite_0.1.2    forcats_0.5.2   stringr_1.5.0   dplyr_1.1.2    
# [6] purrr_0.3.4     readr_2.1.2     tidyr_1.2.0     tibble_3.2.1    ggplot2_3.3.6  
# [11] tidyverse_1.3.2
# 
# loaded via a namespace (and not attached):
#   [1] tidyselect_1.2.0    haven_2.5.1         gargle_1.2.0        labelled_2.9.1     
# [5] colorspace_2.0-3    vctrs_0.6.2         generics_0.1.3      usethis_2.1.6      
# [9] utf8_1.2.3          rlang_1.1.1         pillar_1.9.0        glue_1.6.2         
# [13] withr_2.5.0         DBI_1.1.3           dbplyr_2.2.1        modelr_0.1.9       
# [17] readxl_1.4.1        lifecycle_1.0.3     munsell_0.5.0       gtable_0.3.1       
# [21] cellranger_1.1.0    memoise_2.0.1       tzdb_0.3.0          fastmap_1.1.0      
# [25] fansi_1.0.4         broom_1.0.1         scales_1.2.1        backports_1.4.1    
# [29] googlesheets4_1.0.1 cachem_1.0.6        jsonlite_1.8.0      fs_1.5.2           
# [33] hms_1.1.2           stringi_1.7.12      codebook_0.9.2      grid_4.2.1         
# [37] cli_3.6.1           tools_4.2.1         magrittr_2.0.3      crayon_1.5.1       
# [41] pkgconfig_2.0.3     robotstxt_0.7.13    ellipsis_0.3.2      xml2_1.3.3         
# [45] reprex_2.0.2        googledrive_2.0.0   lubridate_1.8.0     assertthat_0.2.1   
# [49] httr_1.4.4          rstudioapi_0.14     ratelimitr_0.4.1    R6_2.5.1           
# [53] compiler_4.2.1    